ML vs. DNN in Housing Price Prediction
This introduction briefly describes what this project is about: which datasets we used, which machine learning algorithms we utilized, which deep learning algorithm I used, and how I tried to improve its performance using hyperparameter tuning, followed by a brief description of the study results.
#Bringing necessary libraries
library(dplyr)
library(ggplot2)
library(rsample)
library(caret)
library(h2o)
library(visdat)
library(recipes)
library(modeldata)
library(ggplot2)
library(plyr)
library(PupillometryR)
library(leaps)
library(MASS)
library(car)
library(glmnet)
library(vip)
library(pls)
library(keras)
library(tfruns)
library(tfestimators)
library(tensorflow)
library(reticulate)
library(remotes)
library(recipes)
library(randomForest)
library(tidyverse)
library(ggmap)
library(sf)
library(RColorBrewer)
library(dplyr)
library(DiagrammeR)
library(patchwork) # plot_layout() is used below to arrange ggplots side by side
Paragraph about dataset description - Ames Housing Dataset and Boston Housing Dataset
#Bringing the Ames Housing dataset (cleaned version provided by the AmesHousing package)
ames <- AmesHousing::make_ames()
#Listing all 81 variables of the Ames data
names(ames)
## [1] "MS_SubClass" "MS_Zoning" "Lot_Frontage"
## [4] "Lot_Area" "Street" "Alley"
## [7] "Lot_Shape" "Land_Contour" "Utilities"
## [10] "Lot_Config" "Land_Slope" "Neighborhood"
## [13] "Condition_1" "Condition_2" "Bldg_Type"
## [16] "House_Style" "Overall_Qual" "Overall_Cond"
## [19] "Year_Built" "Year_Remod_Add" "Roof_Style"
## [22] "Roof_Matl" "Exterior_1st" "Exterior_2nd"
## [25] "Mas_Vnr_Type" "Mas_Vnr_Area" "Exter_Qual"
## [28] "Exter_Cond" "Foundation" "Bsmt_Qual"
## [31] "Bsmt_Cond" "Bsmt_Exposure" "BsmtFin_Type_1"
## [34] "BsmtFin_SF_1" "BsmtFin_Type_2" "BsmtFin_SF_2"
## [37] "Bsmt_Unf_SF" "Total_Bsmt_SF" "Heating"
## [40] "Heating_QC" "Central_Air" "Electrical"
## [43] "First_Flr_SF" "Second_Flr_SF" "Low_Qual_Fin_SF"
## [46] "Gr_Liv_Area" "Bsmt_Full_Bath" "Bsmt_Half_Bath"
## [49] "Full_Bath" "Half_Bath" "Bedroom_AbvGr"
## [52] "Kitchen_AbvGr" "Kitchen_Qual" "TotRms_AbvGrd"
## [55] "Functional" "Fireplaces" "Fireplace_Qu"
## [58] "Garage_Type" "Garage_Finish" "Garage_Cars"
## [61] "Garage_Area" "Garage_Qual" "Garage_Cond"
## [64] "Paved_Drive" "Wood_Deck_SF" "Open_Porch_SF"
## [67] "Enclosed_Porch" "Three_season_porch" "Screen_Porch"
## [70] "Pool_Area" "Pool_QC" "Fence"
## [73] "Misc_Feature" "Misc_Val" "Mo_Sold"
## [76] "Year_Sold" "Sale_Type" "Sale_Condition"
## [79] "Sale_Price" "Longitude" "Latitude"
#2,930 observations of 81 variables
dim(ames)
## [1] 2930 81
#Bringing the Boston Housing dataset (Boston comes from the MASS package loaded above)
boston <- Boston
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
#506 observations of 14 variables
dim(boston)
## [1] 506 14
#Workflow diagram of the whole project, drawn with GraphViz (DOT) via DiagrammeR.
#Fixed: the training-set node label read '70%\n training nset' (typo) — now '70%\ntraining set'.
DiagrammeR::grViz("
digraph surveillance_diagram { # 'digraph' means 'directional graph', then the graph name
# graph statement
graph [layout = dot,
rankdir = LR, # layout left-to-right
fontsize = 13]
# nodes (circles)
node [shape = circle, # shape = circle
fixedsize = true
width = 2.1,
height=1.5,
fontsize=19,
fontname = 'Helvetica-Bold',
]
# individual component
ames_data [label = 'Ames\nhousing data', shape = cylinder, fontcolor='#3caea3', color='#f6d55c']
divide_data [label = 'Divide Ames\ndata by types', shape = rect,fontcolor='#3caea3' color='#3caea3']
numeric [label = 'Numeric \n variables', shape = rect,fontcolor='#3caea3' color='#3caea3']
discrete [label = 'Discrete \n variables', shape = rect, fontcolor='#3caea3' color='#3caea3']
ordinal [label = 'Ordinal \n variables', shape = rect, fontcolor='#3caea3' color='#3caea3']
categorical [label = 'Categorical \n variables', shape = rect, fontcolor='#3caea3' color='#3caea3']
reorder [label = 'Reorder\nordinal\nvariables', shape = rect, fontcolor='#3caea3' color='#3caea3']
one_hot [label = 'One-Hot\nEncoding', shape = rect, fontcolor='#3caea3' color='#3caea3']
training_set [label = '70%\ntraining set', shape = rect,fontcolor='#3caea3' color='#3caea3']
testing_set [label = '30% \ntesting\nvalidation set', shape = rect, fontcolor='#3caea3' color='#3caea3']
standardize [label = 'Standardize\nthe dataset', shape = rect, fontcolor='#3caea3' color='#3caea3']
eda [label = 'EDA', shape=oval, height=1.1, fontcolor='#20639b', color='#20639b']
gis [label = 'GIS\nvisualization', shape=oval, fontcolor='#20639b', color='#20639b']
machine_learning [label = 'Machine\nLearning\nmodels', shape=octagon, fontcolor = '#ed553b', color='#ed553b']
deep_learning [label = 'Deep\nNeural\nNetwork', shape=doubleoctagon, fontcolor = '#ed553b', color='#ed553b']
hyper_tuning [label = 'hyper-\nparameter\ntuning', shape=octagon, fontcolor = '#ed553b', color='#ed553b']
performance [label = 'Performance\nComparison', fontcolor = '#ed553b', color='#ed553b']
best_model [label = 'Select\nbest performing\nmodel',width=2.2, height=2.2 ,shape=tripleoctagon,fontcolor = '#173F5F', color='#173F5F']
# Machine Learning Models
mlr [label = 'Multiple\nLinear\nRegression', fontcolor = '#ed553b', color='#ed553b']
regularization [label = 'Regulari-\nzation:\nLasso\nRidge\nENET', fontcolor = '#ed553b', color='#ed553b']
tree_based [label = 'Tree-based:\nRandom Forest\nBagging', fontcolor = '#ed553b', color='#ed553b']
dim_reduction [label = 'Dimension\nreduction:\nPCR\nPLSR',fontcolor = '#ed553b', color='#ed553b']
#linking the components:
ames_data -> divide_data -> {numeric discrete ordinal categorical}[style = dashed, color = '#3caea3']
ordinal -> reorder[style = dashed, color = '#3caea3']
categorical -> one_hot[style = dashed, color = '#3caea3']
{one_hot reorder discrete numeric} -> training_set[style = dashed, color = '#3caea3']
{one_hot reorder discrete numeric} -> testing_set[style = dashed, color = '#3caea3']
{training_set testing_set} -> standardize[style = dashed, color = '#3caea3']
standardize -> {eda gis}[style = dashed, color = '#20639b']
{eda gis} -> machine_learning -> {mlr regularization tree_based dim_reduction}[style = dashed, color = '#ed553b']
{eda gis} -> hyper_tuning -> deep_learning[style = dashed, color = '#ed553b']
{mlr regularization tree_based dim_reduction deep_learning} -> performance -> best_model[style = dashed, color = '#ed553b']
}")
Before developing statistical models, the original dataset needed to be pre-processed by employing the following steps:
brief outlining of this section- simply state how things are organized
#Extracting continuous numeric variables (areas, square footages, years, dollar values)
ames_numeric <- ames %>% dplyr::select(Lot_Frontage, Lot_Area, Year_Built,
Year_Remod_Add, Mas_Vnr_Area, BsmtFin_SF_1,
BsmtFin_SF_2, Bsmt_Unf_SF, Total_Bsmt_SF,
First_Flr_SF, Second_Flr_SF, Low_Qual_Fin_SF,
Gr_Liv_Area, Garage_Area, Wood_Deck_SF,
Open_Porch_SF, Enclosed_Porch, Three_season_porch,
Screen_Porch, Pool_Area, Misc_Val)
#Extracting discrete count variables (room/bathroom counts, sale month and year)
ames_discrete <- ames %>% dplyr::select(Bsmt_Full_Bath, Bsmt_Half_Bath, Full_Bath, Half_Bath,
Bedroom_AbvGr, Kitchen_AbvGr,
TotRms_AbvGrd, Fireplaces,Garage_Cars,
Mo_Sold, Year_Sold)
#Extracting nominal categorical variables (no natural ordering; one-hot encoded later)
ames_categorical <- ames %>% dplyr::select(MS_SubClass, MS_Zoning, Street,
Alley, Land_Contour,
Lot_Config,
Neighborhood, Condition_1, Condition_2,
Bldg_Type, House_Style,
Roof_Matl, Exterior_1st,
Exterior_2nd, Mas_Vnr_Type,
Foundation,
Heating,
Central_Air,
Garage_Type,
Misc_Feature,
Sale_Type, Sale_Condition)
#Extracting ordinal variables (quality/condition scales; recoded to integers below)
ames_ordinal <- ames %>% dplyr::select(Lot_Shape, Land_Slope,
Overall_Qual, Overall_Cond,
Exter_Qual, Exter_Cond,
Bsmt_Qual, Bsmt_Cond, Bsmt_Exposure,
BsmtFin_Type_1, BsmtFin_Type_2,
Heating_QC, Electrical, Kitchen_Qual,
Functional, Fireplace_Qu, Garage_Finish,
Garage_Qual,Garage_Cond,
Paved_Drive, Pool_QC, Fence)
#Extracting the response variable (Sale_Price)
ames_response <- ames %>% dplyr::select(Sale_Price)
# ---- Ordinal encoding ------------------------------------------------------
# Each ordinal factor is mapped to consecutive integers (1 = lowest level) by
# re-levelling it with an explicit, worst-to-best level order and converting
# to integer. Values not present in the level set become NA, exactly as in
# the original per-variable factor()/as.integer() calls this replaces.

# Map one vector onto 1..length(lev) following the supplied level order.
encode_ordinal <- function(x, lev) {
  as.integer(factor(x, levels = lev, ordered = TRUE))
}

# Five-point quality scale shared by several variables
qual5 <- c('Poor', 'Fair', 'Typical', 'Good', 'Excellent')

# Explicit level order (worst -> best) for every ordinal variable
ordinal_levels <- list(
  Lot_Shape      = c('Irregular', 'Moderately_Irregular', 'Slightly_Irregular', 'Regular'),
  Land_Slope     = c('Gtl', 'Mod', 'Sev'),
  Overall_Qual   = c('Very_Poor', 'Poor', 'Fair', 'Below_Average', 'Average',
                     'Above_Average', 'Good', 'Very_Good', 'Excellent', 'Very_Excellent'),
  Overall_Cond   = c('Very_Poor', 'Poor', 'Fair', 'Below_Average', 'Average',
                     'Above_Average', 'Good', 'Very_Good', 'Excellent', 'Very_Excellent'),
  Exter_Qual     = c('Fair', 'Typical', 'Good', 'Excellent'),
  Exter_Cond     = qual5,
  Bsmt_Qual      = c('No_Basement', qual5),
  Bsmt_Cond      = c('No_Basement', qual5),
  Bsmt_Exposure  = c('No_Basement', 'No', 'Mn', 'Av', 'Gd'),
  BsmtFin_Type_1 = c('No_Basement', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'),
  BsmtFin_Type_2 = c('No_Basement', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'),
  Heating_QC     = qual5,
  Electrical     = c('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'),
  Kitchen_Qual   = qual5,
  Functional     = c('Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'),
  Fireplace_Qu   = c('No_Fireplace', qual5),
  Garage_Finish  = c('No_Garage', 'Unf', 'RFn', 'Fin'),
  Garage_Qual    = c('No_Garage', qual5),
  Garage_Cond    = c('No_Garage', qual5),
  Paved_Drive    = c('Dirt_Gravel', 'Partial_Pavement', 'Paved'),
  Pool_QC        = c('No_Pool', qual5),
  Fence          = c('No_Fence', 'Minimum_Wood_Wire', 'Good_Wood', 'Minimum_Privacy', 'Good_Privacy')
)

# Electrical contains 'Unknown' entries; recode them to 'SBrkr' before
# encoding, as in the original analysis.
ames_ordinal$Electrical[ames$Electrical == 'Unknown'] <- 'SBrkr'

# Apply the integer encoding to every ordinal column
for (v in names(ordinal_levels)) {
  ames_ordinal[[v]] <- encode_ordinal(ames_ordinal[[v]], ordinal_levels[[v]])
}
glimpse(ames_ordinal)
## Rows: 2,930
## Columns: 22
## $ Lot_Shape <int> 3, 4, 3, 4, 3, 3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 2, 3, 4, 4…
## $ Land_Slope <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
## $ Overall_Qual <int> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, 4…
## $ Overall_Cond <int> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, 5…
## $ Exter_Qual <int> 2, 2, 2, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 4, 3, 3, 2…
## $ Exter_Cond <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3…
## $ Bsmt_Qual <int> 4, 4, 4, 4, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 6, 4…
## $ Bsmt_Cond <int> 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Bsmt_Exposure <int> 5, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 5, 4, 5, 4, 4, 2…
## $ BsmtFin_Type_1 <int> 5, 4, 6, 6, 7, 7, 7, 6, 7, 2, 2, 6, 2, 7, 7, 6, 7, 7, 3…
## $ BsmtFin_Type_2 <int> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2…
## $ Heating_QC <int> 2, 3, 3, 5, 4, 5, 5, 5, 5, 4, 4, 5, 4, 4, 3, 5, 4, 5, 5…
## $ Electrical <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ Kitchen_Qual <int> 3, 3, 4, 5, 3, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 5, 3, 5, 3…
## $ Functional <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8…
## $ Fireplace_Qu <int> 5, 1, 1, 4, 4, 5, 1, 1, 4, 4, 4, 1, 5, 2, 1, 5, 1, 6, 1…
## $ Garage_Finish <int> 4, 2, 2, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 2, 3, 4, 3, 4, 2…
## $ Garage_Qual <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Garage_Cond <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Paved_Drive <int> 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ Pool_QC <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Fence <int> 1, 4, 1, 1, 4, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1…
#Combine the (now integer-coded) ordinal, numeric, and discrete variables
#into a single all-numeric data frame.
ames_numeric <- cbind(ames_ordinal, ames_numeric, ames_discrete)
#70/30 train/test split. Fixed: the seed is now set so the split (and every
#result quoted later in this document) is reproducible.
set.seed(1234)
n <- nrow(ames_numeric)
train_index <- sample(seq_len(n), n * 0.7, replace = FALSE)
ames_numeric_train <- ames_numeric[train_index, ]
ames_numeric_test <- ames_numeric[-train_index, ]
#Calculating mean and standard deviation from the TRAINING data only, so no
#information leaks from the test set into model training.
mean_train <- apply(ames_numeric_train, 2, mean)
sd_train <- apply(ames_numeric_train, 2, sd)
#Scale both sets with the training-set statistics
ames_numeric_train <- scale(ames_numeric_train, center = mean_train, scale = sd_train)
ames_numeric_test <- scale(ames_numeric_test, center = mean_train, scale = sd_train) #numeric data are now scaled properly
The table below displays the resulting mean and standard deviation of the pre-processed training dataset. As can be seen, all the numeric variables have a mean of 0 and a standard deviation of 1.
The following table shows the mean and standard deviation for “testing” dataset. Unlike the training dataset, the means and standard deviations are not exactly 0 and 1, but fairly close. This, as mentioned above, was done on purpose to ensure no information leak from the training dataset which will be used for DNN training.
Machine learning algorithms cannot use categorical variables directly as input. Instead, all data must be presented in a numerical format that machines can properly understand and process. One-hot encoding is one of the most commonly used methods to achieve this. The following code demonstrates how I applied this technique to convert all of my categorical variables into binary vectors, and the accompanying results below show that all variables have been processed as intended.
#One-hot encode all categorical variables (caret::dummyVars builds one binary column per level)
dummy_formula <- dummyVars(~., data=ames_categorical)
ames_categorical_one_hot <- predict(dummy_formula, newdata=ames_categorical)
#Split the encoded matrix with the same train/test index used for the numeric data
ames_categorical_train <- ames_categorical_one_hot[train_index,]
ames_categorical_test <- ames_categorical_one_hot[-train_index,]
## Rows: 2,051
## Columns: 5
## $ MS_SubClass.One_Story_1946_and_Newer_All_Styles <dbl> 0, 1, 0, 1, 0, 0, 0…
## $ MS_SubClass.One_Story_1945_and_Older <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_Story_with_Finished_Attic_All_Ages <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_and_Half_Story_Unfinished_All_Ages <dbl> 0, 0, 0, 0, 0, 0, 0…
## $ MS_SubClass.One_and_Half_Story_Finished_All_Ages <dbl> 0, 0, 0, 0, 1, 0, 0…
Now that all the variables in the Ames housing dataset have been properly pre-processed, as a final step I am going to organize the explanatory and response variables into separate training and testing datasets. The results from the ‘dim()’ function demonstrate that we are using 70% of the data as the training set, while the remaining 30% is reserved for testing and validation. We are now ready to move on to the more exciting parts of our analysis: Exploratory Data Analysis (EDA) and actual machine learning!
#Response vector (Sale_Price) split with the same train/test index
ames_train_targets <- ames$Sale_Price[train_index]
ames_test_targets <- ames$Sale_Price[-train_index]
#Full design matrices for the deep network: scaled numeric + one-hot categorical columns
ames_DN_train <- cbind(ames_numeric_train, ames_categorical_train)
ames_DN_test <- cbind(ames_numeric_test, ames_categorical_test)
dim(as.data.frame(ames_DN_train))
## [1] 2051 239
dim(as.data.frame(ames_DN_test))
## [1] 879 239
Explanation about EDA
#Bringing the road shapefile for visualization
# NOTE(review): relative path — the tl_2023_19169_roads folder (presumably
# TIGER/Line roads; FIPS 19169 — confirm county) must sit next to the project directory.
ames_road <- st_read('../tl_2023_19169_roads', quiet=TRUE, layer="tl_2023_19169_roads")
#Utilizing a ColorBrewer palette, interpolated from 8 to 29 colors (one per neighborhood)
nb.cols <- 29
myColors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
names(myColors) <- levels(ames$Neighborhood)
custom_colors <- scale_colour_manual(name = "Neighborhood", values = myColors)
#Setting the bounding box (lon/lat limits) around the city of Ames
xmin <- -93.70
xmax <- -93.60
ymin <- 41.99
ymax <- 42.06
#Preparing a background map from the road network
background <- ggplot(ames_road) + geom_sf(color="grey") +
xlim(xmin, xmax) + ylim(ymin, ymax)
#Visualizing the Ames neighborhoods on top of the road background
background + geom_point(data=ames,mapping = aes(x=Longitude, y=Latitude, colour = Neighborhood), size=1.4) + custom_colors
#Histogram of the response variable Sale_Price
hist_saleprice <-
ggplot(ames, aes(x=Sale_Price)) + geom_histogram(bins=50, col="white")
#Bin Sale_Price into five price bands for mapping.
#Fixed: `labels =` spelled out — the original `label =` only worked through
#R's partial argument matching of cut().
ames$Price_Category <- cut(ames$Sale_Price,
breaks = c(-Inf, 100000, 200000, 300000, 400000, Inf),
labels = c("Very Low", "Low", "Medium", "High", "Very High"))
#Cool-to-warm palette: blue = cheapest band, red = most expensive
custom_palette <- c("Very Low"="blue", "Low"="lightblue", "Medium"="yellow", "High"="orange", "Very High"="red")
#Map of the price bands over the road background
map_saleprice<-
background +
geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Price_Category), size=1.4) +
scale_color_manual(values = custom_palette) +
guides(colour=guide_legend(nrow=2)) +
theme(legend.position = "bottom",
legend.title = element_blank(),
legend.text = element_text(size = 9))
#Side-by-side layout (plot_layout comes from the patchwork package)
hist_saleprice + map_saleprice + plot_layout(ncol = 2)
#Histogram of Lot_Area; x-axis truncated at 50,000 sq ft to keep extreme outliers from flattening the plot
hist_lot <- ggplot(ames, aes(x=Lot_Area)) +
geom_histogram(bins=50, col="white") +
xlim(0, 50000) #Lot_Area (Continuous): Lot size in square feet
#Scatter plot of Lot_Area vs. Sale_Price with a smoothed trend line
lot_price_scatter <-
ggplot(ames, aes(x=Lot_Area, y=Sale_Price)) + geom_point() +
xlim(0, 30000) +
geom_smooth(se=TRUE)
hist_lot + lot_price_scatter + plot_layout(ncol = 2)
#scatter plot between Gr_Liv_Area (above-ground living area) and Sale_Price
Gr_Liv_SalePrice_scatter <- ggplot(ames, aes(x=Gr_Liv_Area, y=Sale_Price)) + geom_point() + geom_smooth(se=TRUE)
#defining class breaks based on Gr_Liv_Area
#(upper break 5642 is presumably the maximum Gr_Liv_Area in the data — TODO confirm)
ames$Gr_Liv_Area_category <- cut(ames$Gr_Liv_Area, breaks = c(0, 500, 1500, 2000, 2500, 5642),
labels = c("< 500", "500-1499", "1500-1999", "2000-2499", "2500+"),
include.lowest = TRUE,
right=FALSE)
#defining a custom palette (cool = small, warm = large living area)
custom_palette <- c("< 500" = "blue", "500-1499" = "lightblue", "1500-1999" = "yellow", "2000-2499" = "orange", "2500+" = "red")
#Creating a map showing the spatial distribution of Gr_Liv_Area
map_Gr_Liv_Area<-
background +
geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Gr_Liv_Area_category), size=1.4) +
scale_color_manual(values = custom_palette) +
guides(colour=guide_legend(nrow=2)) +
theme(legend.position = "bottom",
legend.title = element_blank(),
legend.text = element_text(size = 9))
Gr_Liv_SalePrice_scatter + map_Gr_Liv_Area + plot_layout(ncol = 2)
#Histogram for Year_Built
hist_year_built <- ggplot(ames, aes(x=Year_Built)) + geom_histogram(bins=45, col="white")
#Scatter plot for Year_Built vs. Sale_Price with a smoothed trend line
year_built_saleprice_scatter <- ggplot(ames, aes(x=Year_Built, y=Sale_Price)) + geom_point() + geom_smooth(se=TRUE)
#Bin Year_Built into construction-era classes.
#Fixed: `labels =` spelled out — the original `label =` only worked through
#R's partial argument matching of cut().
# NOTE(review): the last two breaks are 1990 and 2000, so "90_20" covers
# 1990-2000 and "after_20" covers 2000 onwards — confirm the intended eras.
ames$year_built_category <- cut(ames$Year_Built, breaks = c(-Inf, 1900, 1930, 1960, 1990, 2000, Inf),
labels = c("before_1900", "00_30", "30_60", "60_90", "90_20", "after_20"))
#Sequential palette: lighter = older, darker = newer construction
custom_palette <- c("before_1900"="#ffffcc", "00_30"="#c7e9b4", "30_60"="#7fcdbb", "60_90"="#41b6c4", "90_20"="#2c7fb8", "after_20"="#253494")
#Spatial distribution of Year_Built
map_year_built <-
background +
geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = year_built_category), size=1.4) +
scale_color_manual(values = custom_palette)
#Boxplot of Sale_Price by Overall_Qual (ames$Overall_Qual is still a factor here —
#the integer recoding above was applied only to the ames_ordinal copy)
boxplot_condition_price <- ggplot(ames, aes(x=Overall_Qual, y=Sale_Price)) + geom_boxplot(aes(colour=Overall_Qual), show.legend = FALSE)
#Spatial distribution of Overall_Qual
map_condition <-
background +
geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Overall_Qual), size=0.8) +
scale_color_brewer(palette = "Spectral")
boxplot_condition_price + map_condition + plot_layout(ncol=2)
#Boxplot of Sale_Price by exterior quality
boxplot_external_qual_price <-
ggplot(ames, aes(x=Exter_Qual, y=Sale_Price)) +
geom_boxplot(aes(colour=Exter_Qual), show.legend = FALSE)
#Spatial distribution of exterior quality
map_external_qual <-
background + geom_point(data=ames, mapping = aes(x=Longitude, y=Latitude, colour = Exter_Qual), size=0.8) +
scale_color_brewer(palette = "Spectral")+
guides(color = guide_legend(title = NULL))
boxplot_external_qual_price + map_external_qual + plot_layout(ncol=2)
Explanation about Performance Comparison of Machine Learning(ML) Models for Housing Price Prediction
#Number of observations and number of repeated random train/test splits
n = nrow(ames)
rep = 15
# NOTE(review): `rep` shadows base::rep — the rep(0,15) calls below still
# resolve to the base function, but a name such as `n_rep` would be safer.
#creating empty vector to save training & testing error for each model
# NOTE(review): despite the `mse.` prefix, these vectors are filled with MAE
# values (see the MAE() calls in the loop below).
mse.train.model3 = mse.train.lasso = mse.train.ridge =
mse.train.enet = mse.train.rf= mse.train.bag =
mse.train.pcr = mse.train.plsr = rep(0,15)
mse.test.model3 = mse.test.lasso = mse.test.ridge =
mse.test.enet = mse.test.rf = mse.test.bag =
mse.test.pcr = mse.test.plsr = rep(0,15)
# ---- Repeated train/test comparison of the ML models -----------------------
# FIX(data leakage): the EDA section above appended three derived columns to
# `ames` (Price_Category, Gr_Liv_Area_category, year_built_category).
# Price_Category is computed directly from Sale_Price, so leaving it in
# `model.matrix(Sale_Price ~ ., ...)` leaks the response into the predictors
# and inflates every model's apparent accuracy. Drop all three before modelling.
ames_model <- ames[, !names(ames) %in% c("Price_Category",
                                         "Gr_Liv_Area_category",
                                         "year_built_category")]
#loop: repeat the 70/30 split `rep` times and record train/test MAE per model
set.seed(1234)
for(i in 1:rep){
cat("Processing loop #", i, "\n")
train_index = sample(1:n, n*0.7, replace = FALSE)
ames_train = ames_model[train_index, ]
ames_test = ames_model[-train_index, ]
#design matrices for glmnet; [,-1] drops the intercept column
X_train = model.matrix(Sale_Price ~ ., ames_train)[,-1]
Y_train = ames_train$Sale_Price
X_test = model.matrix(Sale_Price ~., ames_test)[,-1]
Y_test = ames_test$Sale_Price
#model3 - linear regression using all variables (glmnet with lambda = 0 => unpenalized fit)
model3 <- glmnet(x=X_train, y=Y_train, lambda = 0, alpha=0)
pred.train.model3 <- predict(model3, X_train)
pred.test.model3 <- predict(model3, X_test)
#NOTE: vectors are named mse.* but store MAE
mse.train.model3[i] <- MAE(pred.train.model3, ames_train$Sale_Price)
mse.test.model3[i] <- MAE(pred.test.model3, ames_test$Sale_Price)
#lasso (alpha = 1); lambda chosen by cross-validation, predictions at lambda.min
cvfitl = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=1,standardize=TRUE)
pred.train.l = predict(cvfitl, newx = X_train, s = "lambda.min")
pred.test.l = predict(cvfitl, newx = X_test, s = "lambda.min")
mse.train.lasso[i] = MAE(pred.train.l, Y_train)
mse.test.lasso[i] = MAE(pred.test.l, Y_test)
#ridge (alpha = 0)
cvfitr = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0,standardize=TRUE)
pred.train.r = predict(cvfitr, newx = X_train, s = "lambda.min")
pred.test.r = predict(cvfitr, newx = X_test, s = "lambda.min")
mse.train.ridge[i] = MAE(pred.train.r, Y_train)
mse.test.ridge[i] = MAE(pred.test.r, Y_test)
#elastic net (alpha = 0.5)
cvfiten = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0.5,standardize=TRUE)
pred.train.en = predict(cvfiten, newx = X_train, s = "lambda.min")
pred.test.en = predict(cvfiten, newx = X_test, s = "lambda.min")
mse.train.enet[i] = MAE(pred.train.en, Y_train)
mse.test.enet[i] = MAE(pred.test.en, Y_test)
#PCR on the numeric columns only
numeric_cols = sapply(ames_model, is.numeric)
ames_numeric_only = ames_model[,numeric_cols]
ames_train_pcr = ames_numeric_only[train_index,]
ames_test_pcr = ames_numeric_only[-train_index,]
pcr_fit <- pcr(Sale_Price ~., data=ames_train_pcr, scale=TRUE, ncomp=30) #TODO: re-select ncomp by cross-validation
pcr.predict.train = predict(pcr_fit, ncomp=30) #no newdata => fitted values on training data
pcr.predict.test = predict(pcr_fit, newdata=ames_test_pcr, ncomp=30)
mse.train.pcr[i] = MAE(pcr.predict.train, ames_train_pcr$Sale_Price)
mse.test.pcr[i] = MAE(pcr.predict.test, ames_test_pcr$Sale_Price)
#PLSR; ncomp has been selected through cross-validation
plsr_fit <- pls::plsr(Sale_Price ~., data=ames_train_pcr, scale=TRUE, ncomp=6)
plsr.predict.train = predict(plsr_fit, ncomp=6)
plsr.predict.test = predict(plsr_fit, newdata=ames_test_pcr, ncomp=6)
mse.train.plsr[i] = MAE(plsr.predict.train, ames_train_pcr$Sale_Price)
mse.test.plsr[i] = MAE(plsr.predict.test, ames_test_pcr$Sale_Price)
#Random Forest; predict(rf) with no newdata returns out-of-bag predictions,
#so the "training" error recorded here is OOB error
rf <- randomForest(Sale_Price~., data=ames_train, mtry=8)
rf.predict.train = predict(rf)
rf.predict.test =predict(rf, newdata = ames_test[,!names(ames_test)%in%c("Sale_Price")])
mse.train.rf[i] = MAE(rf.predict.train, ames_train$Sale_Price)
mse.test.rf[i] = MAE(rf.predict.test, ames_test$Sale_Price)
#bagging = random forest with a (near) full predictor count per split
bag <- randomForest(Sale_Price~., data=ames_train, mtry=57)
bag.predict.train = predict(bag)
bag.predict.test =predict(bag, newdata = ames_test[,!names(ames_test)%in%c("Sale_Price")])
mse.train.bag[i] = MAE(bag.predict.train, ames_train$Sale_Price)
mse.test.bag[i] = MAE(bag.predict.test, ames_test$Sale_Price)
}
### Boxplot for test-set MAE of each model
# Fixed: the original cbind()-ed a numeric vector to a character vector,
# coercing the MAE values through character and back (losing precision).
# Build the plotting data frame directly instead.
mse.test.combined <- data.frame(
  mse.test.combined = c(mse.test.model3, mse.test.lasso, mse.test.ridge,
                        mse.test.enet, mse.test.pcr, mse.test.plsr,
                        mse.test.rf, mse.test.bag),
  index = rep(c("test.model3", "test.lasso", "test.ridge",
                "test.enet", "test.pcr", "test.plsr",
                "test.rf", "test.bag"), each = 15)
)
ggplot(data=mse.test.combined, aes(x=index, y=mse.test.combined)) + geom_boxplot()
### Boxplot for training-set MAE of each model
mse.train.combined <- data.frame(
  mse.train.combined = c(mse.train.model3, mse.train.lasso, mse.train.ridge,
                         mse.train.enet, mse.train.pcr, mse.train.plsr,
                         mse.train.rf, mse.train.bag),
  index = rep(c("train.model3", "train.lasso", "train.ridge",
                "train.enet", "train.pcr", "train.plsr",
                "train.rf", "train.bag"), each = 15)
)
ggplot(data=mse.train.combined, aes(x=index, y=mse.train.combined)) + geom_boxplot()
Explanation about Interpreting Machine Learning Models
Explanation about Deep Neural Network(DNN) for Housing Price Prediction
## Initial Deep Learning Model
## Initial model looks like the following
#Builds a fresh network: two 64-unit relu hidden layers and a single linear
#output unit, compiled with RMSprop on MSE loss, tracking MAE as the metric.
build_model <- function(){ #because we need to instantiate the same model multiple times, we use function to construct it.
model <- keras_model_sequential() %>%
layer_dense(64, activation = "relu", input_shape = ncol(ames_DN_train)) %>%
layer_dense(64, activation = "relu") %>%
layer_dense(1)
model %>% compile(optimizer = optimizer_rmsprop(),
loss = "mse",
metrics = "mae")
model #return the compiled, untrained model
}
## I went through model fine tuning process, 1) first by manually tweaking hyperparameters 2) second by utilizing grid search
#Hyperparameter flags with their default values; tuning_run() below
#overrides these for every grid-search combination.
FLAGS <- flags(
# Units in hidden layers 1 and 2
flag_numeric("nodes1", 256),
flag_numeric("nodes2", 128),
# Dropout rates after each hidden layer
flag_numeric("dropout1", 0.4),
flag_numeric("dropout2", 0.3),
#learning parameters (optimizer name passed to compile())
flag_string("optimizer", "rmsprop"),
# batch_size for fit()
flag_numeric("batch_size", 64)
)
#Tuning-run script body: builds, compiles, and fits one model from the FLAGS
#above; executed once per hyperparameter combination by tuning_run() below.
model <- keras_model_sequential() %>%
layer_dense(units = FLAGS$nodes1, activation = "relu", input_shape = ncol(ames_DN_train)) %>%
layer_dropout(rate = FLAGS$dropout1) %>%
layer_dense(units = FLAGS$nodes2, activation = "relu") %>%
layer_dropout(rate = FLAGS$dropout2) %>%
layer_dense(units = 1) %>%
compile(
loss = "mse",
metrics = "mae",
optimizer = FLAGS$optimizer # fixed: was `FLAG$optimizer` (object 'FLAG' does not exist)
) %>%
fit(
x = ames_DN_train,
y = ames_train_targets,
epochs = 500,
batch_size = FLAGS$batch_size,
validation_split = 0.2,
callbacks = list(
callback_early_stopping(patience = 25) #stop once val loss stalls for 25 epochs
),
verbose = 0
)
#Grid search over the flag combinations; `sample = 0.7` runs a random 70% of
#all combinations to keep total run time manageable.
# NOTE(review): hard-coded absolute Windows path — make this relative to the
# project directory before sharing or re-running.
runs <- tuning_run(file = "C:/coding/R/scripts/ames_grid_search.R",
flags = list(
nodes1 = c(64, 128, 256),
nodes2 = c(64, 128, 256),
dropout1 = c(0.2, 0.3, 0.4),
dropout2 = c(0.2, 0.3, 0.4),
optimizer = c("rmsprop", "adam"),
batch_size = c(16,32,64,128)
),
sample=0.7
)
#Best run found by the grid search (one row of `runs` shown below):
# Rows: 1
# Columns: 28
# $ X <int> 129
# $ run_dir <chr> "runs/2024-06-04T05-23-16Z"
# $ metric_loss <int> 653320960
# $ metric_mae <dbl> 17894.76
# $ metric_val_loss <int> 613711040
# $ metric_val_mae <dbl> 14674.82
# $ flag_nodes1 <int> 128
# $ flag_nodes2 <int> 128
# $ flag_dropout1 <dbl> 0.4
# $ flag_dropout2 <dbl> 0.2
# $ flag_batch_size <int> 16
# $ epochs <int> 500
# $ epochs_completed <int> 113
# $ metrics <chr> "runs/2024-06-04T05-23-16Z/tfruns.d/metrics.json"
# $ model <chr> "Model: \"sequential\"\n________________________________________________________________________________\n Layer (type) ???
# $ loss_function <chr> "mse"
# $ optimizer <chr> "<keras.src.optimizers.rmsprop.RMSprop object at 0x000002A99E9CFBD0>"
# $ learning_rate <dbl> 0.01
# $ script <chr> "ames_grid_search.R"
# $ start <chr> "2024-06-04 05:23:17.35663"
# $ end <chr> "2024-06-04 05:23:40.85267"
# $ completed <lgl> TRUE
# $ output <chr> "\n> #source : https://bradleyboehmke.github.io/HOML/deep-learning.html\n> \n> # FLAGS <- flags(\n> # # Nodes\n> # flag_num???
# $ source_code <chr> "runs/2024-06-04T05-23-16Z/tfruns.d/source.tar.gz"
# $ context <chr> "local"
# $ type <chr> "training"
## Building Deep Learning model; after hyperparameter fine tuning, I found the following best performing DN Model
## As can be seen, I, aided by grid search above, have added l2_regularization, layer_dropout, and adjusted unit number for each layer.
## lastly, I used adam optimizer instead of rmsprop. <-- in the portfolio, I need to briefly explain how each parameter helps.
#Final architecture: two 128-unit relu hidden layers, each with L2 weight
#decay (0.001) and dropout (0.4 then 0.2), a linear output unit, Adam
#optimizer, MSE loss, and MAE as the tracked metric.
build_model <- function(){ #because we need to instantiate the same model multiple times, we use function to construct it.
model <- keras_model_sequential() %>%
layer_dense(128, activation = "relu", kernel_regularizer = regularizer_l2(0.001),input_shape = ncol(ames_DN_train)) %>%
layer_dropout(rate=0.4) %>%
layer_dense(128, activation = "relu", kernel_regularizer = regularizer_l2(0.001)) %>%
layer_dropout(rate=0.2) %>%
layer_dense(1)
model %>% compile(optimizer = optimizer_adam(),
loss = "mse",
metrics = "mae")
model #return the compiled, untrained model
}
#Training the final deep neural network model for 500 epochs using k-fold cross validation
#(num_epochs is actually set to 2000 below; the fold-averaged validation MAE
#curve is then used to choose the best epoch count)
k <- 5
#Randomly assign each training row to one of the k folds
fold_id <- sample(rep(1:k, length.out = nrow(ames_DN_train)))
num_epochs <- 2000
all_mae_histories <- list()
for(i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- ames_DN_train[val_indices, ] #prepare the validation data
val_targets <- ames_train_targets[val_indices]
partial_train_data <- ames_DN_train[-val_indices,]
partial_train_targets <- ames_train_targets[-val_indices]
model <- build_model() #fresh, untrained model for every fold
history <- model %>% fit(
partial_train_data,
partial_train_targets,
validation_data = list(val_data, val_targets),
epochs = num_epochs,
batch_size = 32,
# callbacks = list(
# #callback_early_stopping(patience = 5),
# callback_reduce_lr_on_plateau(factor = 0.05)
# ),
verbose = 0
)
mae_history <- history$metrics$val_mae #per-epoch validation MAE for this fold
all_mae_histories[[i]] <- mae_history
}
#rows = epochs, columns = folds
all_mae_histories <- do.call(cbind, all_mae_histories)
# NOTE(review): this plots only the final fold's history, not the fold average
plot(history)
average_mae_history <- rowMeans(all_mae_histories) #calculating average per epoch
plot(average_mae_history, xlab="epoch", type = 'l')
#Ignore the first 200 epochs so the y-axis focuses on the converged region
truncated_mae_history <- average_mae_history[-(1:200)]
plot(average_mae_history, xlab="epoch", type = 'l', ylim = range(truncated_mae_history))
# NOTE(review): `min` holds the index of the best epoch and shadows base::min;
# it is reused below, so the name is kept here.
min <-which.min(average_mae_history)
#[1] 936
average_mae_history[min] #901 = 13907.79
### Best model chosen by grid search
#min<- 950
#Retrain the final model on the full training set for ~20% more epochs than
#the CV-optimal epoch count found above, then evaluate once on the test set.
model <- build_model()
history <- model %>%
fit(ames_DN_train, ames_train_targets,
epochs = round(min*1.2), # fixed: was `epoch = min*1.2` (misspelled argument, non-integer value)
batch_size = 32)
#validation_split = 0.2)
result <- model %>% evaluate(ames_DN_test, ames_test_targets)
result["mae"]
ames_dn_test_results <- rep(0, 15)
for(i in 1:15) {
cat("Processing Loop #", i, "\n")
model <- build_model()
model %>% fit(ames_DN_train, ames_train_targets, #train it on the entirety of the data
epochs = min*1.2, batch_size = 32, verbose = 0)
result <- model %>% evaluate(ames_DN_test, ames_test_targets)
ames_dn_test_results[i] <- result["mae"]
}
#result from baseline model #this looks better than the model below hmm...
# NOTE(review): this hard-coded vector OVERWRITES the 15 test MAEs just
# computed by the loop above. Rename one of the two (e.g. keep the baseline
# as ames_dn_baseline_results) before drawing conclusions from the final
# comparison plot.
ames_dn_test_results <- c(15005.39, 14925.67, 14870.62,
14981.25, 14965.23, 14908.94,
15131.90, 14907.56, 15028.89,
15025.54, 14952.75, 14925.85,
14945.20, 15219.53, 15198.08)
Explanation of the performance comparison between the machine learning models and the deep neural network.
# ensemble of DN and RF? ###
mse.test.rf
ames_dn_test_results
# NOTE(review): this is a weighted average of the two models' per-repetition
# test MAE values, not a true ensemble (that would blend *predictions*, not
# errors). Kept as-is as a rough proxy — confirm intent.
mse.ensemble <- 0.8*ames_dn_test_results + 0.2*mse.test.rf
### After deep learning, include the following for the final model comparison.
# Build the long-format comparison table directly as a data frame. The
# original went through cbind(), which coerced the numeric errors to
# character and then back with as.numeric().
mse.test.combined <- data.frame(
  mse.test.combined = c(mse.test.model3, mse.test.lasso, mse.test.ridge,
                        mse.test.enet, mse.test.pcr, mse.test.plsr,
                        mse.test.rf, ames_dn_test_results, mse.ensemble),  # mse.test.bag,
  index = rep(c("test.model3", "test.lasso", "test.ridge",
                "test.enet", "test.pcr", "test.plsr",  # "test.bag",
                "test.rf", "test.DN", "test.ensemble"),
              each = 15)
)
ggplot(data = mse.test.combined, aes(x = index, y = mse.test.combined)) +
  geom_boxplot()
Explanation of applying the same methods to the Boston Housing dataset.
# Apply the same ML pipeline to the Boston Housing data (MASS::Boston).
boston <- Boston
str(Boston)
dim(boston)
n = nrow(boston)
# Number of train/test resampling repetitions.
# NOTE(review): `rep` masks base::rep as a variable; calls to rep(...) below
# still resolve to the function, but consider renaming to n_rep.
rep = 20
# NOTE(review): despite the mse.* names, all errors below are computed with
# caret::MAE() (mean absolute error), not mean squared error.
mse.train.model1 = mse.train.model2 = mse.train.model3 =
mse.train.lasso = mse.train.ridge = mse.train.enet = rep(0,20) #creating empty vector to save training error for each model
mse.test.model1 = mse.test.model2 = mse.test.model3 =
mse.test.lasso = mse.test.ridge = mse.test.enet = rep(0,20) #creating empty vector to save test error for each model
#loop
set.seed(1234)
for(i in 1:rep){
cat("Processing Loop #", i, "\n\n")
# Fresh random 70/30 train/test split each repetition.
train_index = sample(1:n, n*0.7, replace = FALSE)
boston_train = boston[train_index, ]
boston_test = boston[-train_index, ]
# Model matrices (intercept column dropped) for the glmnet-based models.
X_train = model.matrix(medv ~ ., boston_train)[,-1]
Y_train = boston_train$medv
X_test = model.matrix(medv ~., boston_test)[,-1]
Y_test = boston_test$medv
#model1: simple linear regression on average room count only
model1 = lm(medv ~ rm, data = boston_train)
pred.train.model1 = predict(model1)
pred.test.model1 = predict(model1, newdata=boston_test[,!names(boston_test)%in%c("medv")])
mse.train.model1[i] = MAE(pred.train.model1, boston_train$medv)
#sqrt(sum((boston_train$medv - pred.train.model1)^2)/length(boston_train$medv))
mse.test.model1[i] = MAE(pred.test.model1, boston_test$medv)
#sqrt(sum((boston_test$medv - pred.test.model1)^2)/length(boston_test$medv))
#model2: multiple linear regression on a hand-picked predictor subset
model2 = lm(medv ~ rm + tax + ptratio + black + chas, data = boston_train)
pred.train.model2 = predict(model2)
pred.test.model2 = predict(model2, newdata=boston_test[,!names(boston_test)%in%c("medv")])
mse.train.model2[i] = MAE(pred.train.model2, boston_train$medv)
#sqrt(sum((boston_train$medv - pred.train.model2)^2)/length(boston_train$medv))
mse.test.model2[i] = MAE(pred.test.model2, boston_test$medv)
#sqrt(sum((boston_test$medv - pred.test.model2)^2)/length(boston_test$medv))
#model3: all predictors via glmnet with lambda = 0 (unpenalized fit)
model3 <- glmnet(x=X_train, y=Y_train, lambda = 0, alpha=0)
pred.train.model3 <- predict(model3, X_train)
pred.test.model3 <- predict(model3, X_test)
mse.train.model3[i] <- MAE(pred.train.model3, boston_train$medv)
mse.test.model3[i] <- MAE(pred.test.model3, boston_test$medv)
#lasso (alpha = 1); penalty chosen by CV at lambda.min
cvfitl = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=1,standardize=TRUE)
pred.train.l = predict(cvfitl, newx = X_train, s = "lambda.min")
pred.test.l = predict(cvfitl, newx = X_test, s = "lambda.min")
mse.train.lasso[i] = MAE(pred.train.l, Y_train)
mse.test.lasso[i] = MAE(pred.test.l, Y_test)
#ridge (alpha = 0); penalty chosen by CV at lambda.min
cvfitr = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0,standardize=TRUE)
pred.train.r = predict(cvfitr, newx = X_train, s = "lambda.min")
pred.test.r = predict(cvfitr, newx = X_test, s = "lambda.min")
mse.train.ridge[i] = MAE(pred.train.r, Y_train)
mse.test.ridge[i] = MAE(pred.test.r, Y_test)
#elastic net (alpha = 0.5, equal L1/L2 mix); penalty chosen by CV at lambda.min
cvfiten = cv.glmnet(x=X_train,y=Y_train,family="gaussian",alpha=0.5,standardize=TRUE)
pred.train.en = predict(cvfiten, newx = X_train, s = "lambda.min")
pred.test.en = predict(cvfiten, newx = X_test, s = "lambda.min")
mse.train.enet[i] = MAE(pred.train.en, Y_train)
mse.test.enet[i] = MAE(pred.test.en, Y_test)
}
# Inspect the 20-repetition error vectors: training errors first, then test.
print(mse.train.model1)
print(mse.train.model2)
print(mse.train.model3)
print(mse.train.lasso)
print(mse.train.ridge)
print(mse.train.enet)
print(mse.test.model1)
print(mse.test.model2)
print(mse.test.model3)
print(mse.test.lasso)
print(mse.test.ridge)
print(mse.test.enet)
### Boxplot of test MAE per model
# Assemble the long-format plotting data directly as a data frame; the
# original's cbind() coerced the numeric errors to character and then
# converted them back with as.numeric().
mse.test.combined <- data.frame(
  mse.test.combined = c(mse.test.model1, mse.test.model2, mse.test.model3,
                        mse.test.lasso, mse.test.ridge, mse.test.enet),
  index = rep(c("test.model1", "test.model2", "test.model3",
                "test.lasso", "test.ridge", "test.enet"), each = 20)
)
ggplot(data = mse.test.combined, aes(x = index, y = mse.test.combined)) +
  geom_boxplot()
### Boxplot of training MAE per model
# Same data-frame construction as the test-error plot; avoids the numeric ->
# character -> numeric round trip through cbind().
mse.train.combined <- data.frame(
  mse.train.combined = c(mse.train.model1, mse.train.model2, mse.train.model3,
                         mse.train.lasso, mse.train.ridge, mse.train.enet),
  index = rep(c("train.model1", "train.model2", "train.model3",
                "train.lasso", "train.ridge", "train.enet"), each = 20)
)
ggplot(data = mse.train.combined, aes(x = index, y = mse.train.combined)) +
  geom_boxplot()
Explanation of the deep neural network approach.
# Deep learning: start with the Boston data, then move on to the Ames dataset.
# Ames is the more complicated case because it contains many categorical
# variables that need preprocessing.
boston <- Boston
names(boston)
# Columns 1-13 are the predictors; column 14 (medv) is the target.
# (The original wrapped the index in as.numeric(1:13), a no-op.)
predictors <- as.matrix(boston[, 1:13])
targets <- as.matrix(boston[, 14])
n <- nrow(boston)
set.seed(1234)
train_index <- sample(1:n, 404, replace = FALSE)  # 404 of 506 rows (~80%) for training
train_data <- predictors[train_index, ]
train_targets <- targets[train_index]
test_data <- predictors[-train_index, ]
test_targets <- targets[-train_index]
# Standardize with statistics computed on the training split only, so no
# information leaks from the test set. (Renamed from `mean`/`sd`, which
# masked the base functions of the same name.)
center_vals <- apply(train_data, 2, mean)
scale_vals <- apply(train_data, 2, sd)
train_data <- scale(train_data, center = center_vals, scale = scale_vals)
test_data <- scale(test_data, center = center_vals, scale = scale_vals)
# Build a fresh 64-64-1 network for the Boston data. Wrapped in a function
# because the k-fold CV below instantiates the same architecture once per
# fold, each time with new random weights.
# NOTE: this redefines the Ames build_model() above — run sections in order.
build_model <- function() {
  net <- keras_model_sequential() %>%
    layer_dense(64, activation = "relu") %>%
    layer_dense(64, activation = "relu") %>%
    layer_dense(1)  # linear output unit for regression
  # (Earlier experiments with dropout, batch normalization, and L2
  # regularization were left commented out in the original and are omitted.)
  net %>% compile(optimizer = "rmsprop",
                  loss = "mse",
                  metrics = "mae")
  net
}
#Let's try training the model a bit longer: 500 epochs
#k-fold cross validation to choose the epoch count
k <- 4
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))  # random fold assignment, 1..k per row
num_epochs <- 500
all_mae_histories <- list()  # one validation-MAE curve (length num_epochs) per fold
for(i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- train_data[val_indices, ] #prepare the validation data
val_targets <- train_targets[val_indices]
partial_train_data <- train_data[-val_indices,]  # remaining folds form the training split
partial_train_targets <- train_targets[-val_indices]
model <- build_model()  # fresh weights for every fold
history <- model %>% fit(
partial_train_data,
partial_train_targets,
validation_data = list(val_data, val_targets),
epochs = num_epochs,
batch_size = 16,
verbose = 0
)
mae_history <- history$metrics$val_mae
all_mae_histories[[i]] <- mae_history
}
all_mae_histories <- do.call(cbind, all_mae_histories)  # num_epochs x k matrix
plot(history)  # NOTE(review): plots only the LAST fold's history
average_mae_history <- rowMeans(all_mae_histories) #calculating average validation MAE per epoch
plot(average_mae_history, xlab="epoch", type = 'l')
# Drop the noisy first 20 epochs so the y-range of the plot is readable.
truncated_mae_history <- average_mae_history[-(1:20)]
plot(average_mae_history, xlab="epoch", type = 'l',
ylim = range(truncated_mae_history))
# Epoch with the lowest average validation MAE.
# NOTE(review): `min` masks base::min; the final-training block below uses it.
min = which.min(average_mae_history)
# Train the final model on the full training set for the CV-optimal number
# of epochs found above.
model <- build_model()
history <- model %>% fit(train_data, train_targets,  # train on the entire training set
                         epochs = min, batch_size = 16)
result <- model %>% evaluate(test_data, test_targets)
result["mae"]
# Repeat the train/evaluate cycle 20 times to average over random weight
# initialization and mini-batch ordering.
# NOTE(review): the repeats train 20% longer (min * 1.2) than the single fit
# above — confirm that asymmetry is intended. round() keeps the epoch count
# an integer (min * 1.2 is generally fractional).
DN.test.mae <- rep(0, 20)
for (i in 1:20) {
  cat("Loop #", i, "\n")
  model <- build_model()
  model %>% fit(train_data, train_targets,
                epochs = round(min * 1.2), batch_size = 16, verbose = 0)
  result <- model %>% evaluate(test_data, test_targets)
  DN.test.mae[i] <- result["mae"]
}
DN.test.mae
mean(DN.test.mae)
### Boxplot of test MAE per model, now including the deep network
# Build the long-format plotting data directly as a data frame; the
# original's cbind() coerced the numeric errors to character and back.
mse.test.combined <- data.frame(
  mse.test.combined = c(mse.test.model1, mse.test.model2, mse.test.model3,
                        mse.test.lasso, mse.test.ridge, mse.test.enet,
                        DN.test.mae),
  index = rep(c("test.model1", "test.model2", "test.model3",
                "test.lasso", "test.ridge", "test.enet", "test.DN"),
              each = 20)
)
ggplot(data = mse.test.combined, aes(x = index, y = mse.test.combined)) +
  geom_boxplot()